{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import os\n",
    "import gc\n",
    "from gensim.models import Word2Vec\n",
    "from tensorflow.keras.layers import (Bidirectional,\n",
    "                                     Embedding,\n",
    "                                     GRU, \n",
    "                                     GlobalAveragePooling1D,\n",
    "                                     GlobalMaxPooling1D,\n",
    "                                     Concatenate,\n",
    "                                     SpatialDropout1D,\n",
    "                                     BatchNormalization,\n",
    "                                     Dropout,\n",
    "                                     Dense,\n",
    "                                     Activation,\n",
    "                                     concatenate,\n",
    "                                     Input\n",
    "                                    )\n",
    "from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.metrics import f1_score\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#读取数据集\n",
    "train = pd.read_csv('../data/train_set.csv')\n",
    "test = pd.read_csv('../data/test_set.csv')\n",
    "\n",
    "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=50000, lower=False,filters=\"\")\n",
    "tokenizer.fit_on_texts(list(train['word_seg'].values)+list(test['word_seg'].values))\n",
    "\n",
    "train_ = tokenizer.texts_to_sequences(train['word_seg'].values)\n",
    "test_ = tokenizer.texts_to_sequences(test['word_seg'].values)\n",
    "\n",
    "train_ = tf.keras.preprocessing.sequence.pad_sequences(train_, maxlen=1800)\n",
    "test_ = tf.keras.preprocessing.sequence.pad_sequences(test_, maxlen=1800)\n",
    "\n",
    "word_vocab = tokenizer.word_index\n",
    "\n",
    "\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from tensorflow.keras.utils import to_categorical\n",
    "lb = LabelEncoder()\n",
    "train_label = lb.fit_transform(train['class'].values)\n",
    "train_label = to_categorical(train_label)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#注意，不再这里进行读入"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word Embedding构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "add word2vec finished....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/hcq/miniconda3/envs/python3/lib/python3.6/site-packages/ipykernel_launcher.py:24: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "all_data=pd.concat([train['word_seg'],test['word_seg']])\n",
    "file_name = '../embedding/Word2Vec_word_200.model'\n",
    "if not os.path.exists(file_name):\n",
    "    model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values],\n",
    "                     size=200, \n",
    "                     window=5,\n",
    "                     iter=10, \n",
    "                     workers=11, \n",
    "                     seed=2018, \n",
    "                     min_count=2)\n",
    "    model.save(file_name)\n",
    "else:\n",
    "    model = Word2Vec.load(file_name)\n",
    "print(\"add word2vec finished....\")    \n",
    "\n",
    "\n",
    "\n",
    "count = 0\n",
    "\n",
    "embedding_matrix = np.zeros((len(word_vocab) + 1, 200))\n",
    "for word, i in word_vocab.items():\n",
    "    embedding_vector = model.wv[word] if word in model else None\n",
    "    if embedding_vector is not None:\n",
    "        count += 1\n",
    "        embedding_matrix[i] = embedding_vector\n",
    "    else:\n",
    "        unk_vec = np.random.random(200) * 0.5\n",
    "        unk_vec = unk_vec - unk_vec.mean()\n",
    "        embedding_matrix[i] = unk_vec\n",
    "        \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_model(sent_length, embeddings_weight):\n",
    "    content = Input(shape=(sent_length,), dtype='int32')\n",
    "    embedding = Embedding(\n",
    "        name=\"word_embedding\",\n",
    "        input_dim=embeddings_weight.shape[0],\n",
    "        weights=[embeddings_weight],\n",
    "        output_dim=embeddings_weight.shape[1],\n",
    "        trainable=False)\n",
    "\n",
    "    x = SpatialDropout1D(0.2)(embedding(content))\n",
    "\n",
    "    x = Bidirectional(GRU(200, return_sequences=True))(x)\n",
    "    x = Bidirectional(GRU(200, return_sequences=True))(x)\n",
    "\n",
    "    avg_pool = GlobalAveragePooling1D()(x)\n",
    "    max_pool = GlobalMaxPooling1D()(x)\n",
    "\n",
    "    conc = concatenate([avg_pool, max_pool])\n",
    "\n",
    "    x = Dense(1000)(conc)\n",
    "    x = BatchNormalization()(x)\n",
    "    x = Activation(activation=\"relu\")(x)\n",
    "    x = Dropout(0.2)(x)\n",
    "    x = Dense(500)(x)\n",
    "    x = BatchNormalization()(x)\n",
    "    x = Activation(activation=\"relu\")(x)\n",
    "    output = Dense(19, activation=\"softmax\")(x)\n",
    "\n",
    "    model = tf.keras.models.Model(inputs=content, outputs=output)\n",
    "    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kf = KFold(n_splits=10, shuffle=True, random_state=666)\n",
    "\n",
    "\n",
    "#交叉验证的验证集的概率结果保存\n",
    "train_pre_matrix = np.zeros((train.shape[0],19)) #记录验证集的概率\n",
    "#测试集的概率结果保存（cv次数，测试集的行数，标签）\n",
    "test_pre_matrix = np.zeros((10,test.shape[0],19)) #将10轮的测试概率分别保存起来\n",
    "cv_scores=[] #每一轮线下的验证成绩\n",
    "\n",
    "for i, (train_fold, test_fold) in enumerate(kf.split(train_)):\n",
    "    print(\"第%s的结果\"%i)\n",
    "    X_train, X_valid = train_[train_fold, :], train_[test_fold, :]\n",
    "    y_train, y_valid = train_label[train_fold], train_label[test_fold]\n",
    "\n",
    "    #在这里进行数据组装，\n",
    "    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(10000).batch(64)\n",
    "    val_ds = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(128)\n",
    "    test_ds = tf.data.Dataset.from_tensor_slices((test_,np.zeros((test_.shape[0],19)))).batch(128)\n",
    "\n",
    "    # 检查点保存至的目录\n",
    "    checkpoint_dir = './cv_checkpoints/cv_'+str(i)+'/'\n",
    "    # 检查点的文件名\n",
    "    checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt_{epoch}\")\n",
    "    model = build_model(1800, embedding_matrix)\n",
    "\n",
    "    early_stopping = EarlyStopping(monitor='val_accuracy', patience=6)\n",
    "    plateau = ReduceLROnPlateau(monitor=\"val_accuracy\", verbose=1, mode='max', factor=0.5, patience=3)\n",
    "    checkpoint = ModelCheckpoint(checkpoint_prefix, monitor='val_accuracy', \n",
    "                                 verbose=2, save_best_only=True, mode='max',save_weights_only=True)\n",
    "\n",
    "    if not os.path.exists(checkpoint_dir):\n",
    "        model.fit(train_ds,\n",
    "                  epochs=30,\n",
    "                  validation_data=val_ds,\n",
    "                  callbacks=[early_stopping, plateau, checkpoint],\n",
    "                  verbose=2)\n",
    "\n",
    "    model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))\n",
    "\n",
    "    \n",
    "   #验证集的结果 \n",
    "    valid_prob = model.predict(val_ds)\n",
    "    valid_pred = np.argmax(valid_prob,axis=1)\n",
    "    valid_pred = lb.inverse_transform(valid_pred)\n",
    "\n",
    "    y_valid = np.argmax(y_valid, axis=1)\n",
    "    y_valid = lb.inverse_transform(y_valid)\n",
    "\n",
    "    f1_score_ = f1_score(y_valid,valid_pred,average='macro') \n",
    "    print (\"valid's f1-score: %s\" %f1_score_)\n",
    "\n",
    "\n",
    "    train_pre_matrix[test_fold, :] =  valid_prob\n",
    "\n",
    "    test_pre_matrix[i, :,:]= model.predict(test_ds)\n",
    "    #第一轮的ok\n",
    "    #第二轮？\n",
    "    #GPU 释放\n",
    "\n",
    "    del model; gc.collect()#注意\n",
    "    tf.keras.backend.clear_session()  #注意\n",
    "    \n",
    "np.save(\"cv_test_result.npy\",test_pre_matrix)\n",
    "#模型融合\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 10折概率结果\n",
    "## 结果融合\n",
    " 1、 概率求平均，然后取最大值！作为结果\n",
    " \n",
    " 2、每一折，先取最大值，作为结果\n",
    " 这个结果有10个\n",
    " 【1，1，1，1，2，3，5，9，0，1】 ——>  1:5 2:1 3:1 5:1 9:1 -> 5作为结果\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 结果融合"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 方法一"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = np.load(\"cv_test_result.npy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10, 102277, 19)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(102277, 19)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res = np.load(\"cv_test_result.npy\")\n",
    "res_mean = res.mean(axis=0)\n",
    "res_mean.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_pred = lb.inverse_transform(np.argmax(res_mean,axis=1))\n",
    "test['class'] = test_pred\n",
    "test[[\"id\",\"class\"]].to_csv(\"submission_baseline_dnn_cv.csv\",index=False,header=True,encoding='utf-8')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 方法二"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = np.load(\"cv_test_result.npy\")\n",
    "test_pred_matrix = np.zeros((test.shape[0],10),dtype=int)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "#每一折进行取概率最大值             \n",
    "for i in range(10):\n",
    "    sub_res = res[i,:,:]\n",
    "    sub_test_pred = lb.inverse_transform(np.argmax(sub_res,axis=1))\n",
    "    test_pred_matrix[:,i]=sub_test_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 5,  5,  5, ...,  5,  5,  5],\n",
       "       [ 4,  4,  4, ...,  4,  4,  4],\n",
       "       [12, 13, 13, ...,  1,  3, 13],\n",
       "       ...,\n",
       "       [ 6,  6,  6, ...,  6,  6,  6],\n",
       "       [14, 14, 14, ..., 14, 14, 14],\n",
       "       [13, 13, 13, ..., 13, 13, 13]])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_pred_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a=[5,5,5,5,1,1,1,2,3,4]\n",
    "np.argmax(np.bincount(a))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_pred=[]\n",
    "for line in test_pred_matrix:\n",
    "    test_pred.append(np.argmax(np.bincount(line)))\n",
    "test['class'] = test_pred\n",
    "test[[\"id\",\"class\"]].to_csv(\"submission_baseline_dnn_cv_v2.csv\",index=False,header=True,encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
